pacman::p_load(readxl, tidyverse)Working Script
1. Load necessary libraries
2. Load the dataset
vaByMarketRaw <- read_excel("outputFile -International Visitor Arrivals.xlsx", sheet = "T1", skip = 10)
vaByDemoRaw <- read_excel("outputFile -International Visitor Arrivals.xlsx", sheet = "T2", skip = 10)
vaByStayRaw <- read_excel("outputFile -International Visitor Arrivals.xlsx", sheet = "T3", skip = 9)3. Tourism Markets
3.1 Cleansing the dataset
This file
- contains the international visitor arrivals by inbound tourism markets (monthly)
- excludes arrivals of Malaysians by land
- feb 1991 has a sharp decline due to Gulf crisis
- data for Germany prior to 1991 refers to West Germany only
- all numbers are counts
The following was done to cleanup the vaByMarketRaw dataframe:
- remove the bottom few rows as they were unnecessary for our visualizations
- create a new column to assign the value of Region to the respective Countries
- rename fields and rearrange the columns
- filter out rows that are not needed anymore
- pivot date (month-year) and the number of visitors to reduce the number of columns
vaByMarket <- slice(vaByMarketRaw, 2:(62))
colnames(vaByMarket)[1] <- "Data"
vaByMarket$Region <-
ifelse(vaByMarket$Data %in% c("Brunei Darussalam", "Indonesia", "Malaysia", "Myanmar", "Philippines", "Thailand", "Vietnam", "Other Markets In Southeast Asia"), "Southeast Asia",
ifelse(vaByMarket$Data %in% c("China", "Hong Kong SAR", "Taiwan", "Other Markets In Greater China"), "Greater China",
ifelse(vaByMarket$Data %in% c("Japan", "South Korea", "Other Markets In North Asia"), "North Asia",
ifelse(vaByMarket$Data %in% c("Bangladesh", "India", "Pakistan", "Sri Lanka", "Other Markets In South Asia"), "South Asia",
ifelse(vaByMarket$Data %in% c("Iran", "Israel", "Kuwait", "Saudi Arabia", "United Arab Emirates", "Other Markets In West Asia"), "West Asia",
ifelse(vaByMarket$Data %in% c("Canada", "USA", "Other Markets In Americas"), "Americas",
ifelse(vaByMarket$Data %in% c("Belgium & Luxembourg", "Denmark", "Finland", "France", "Germany", "Italy", "Netherlands", "Norway", "Rep Of Ireland", "Russian Federation", "Spain", "Sweden", "Switzerland", "United Kingdom", "Other Markets In Europe"), "Europe",
ifelse(vaByMarket$Data %in% c("Australia", "New Zealand", "Other Markets In Oceania"), "Oceania",
ifelse(vaByMarket$Data %in% c("Egypt", "Mauritius", "South Africa (Rep Of)", "Other Markets In Africa"), "Africa",
ifelse(vaByMarket$Data %in% c("Others"), "Others", "NA"
))))))))))
vaByMarket <- vaByMarket %>%
select(542, 1, 2:541)
colnames(vaByMarket)[2] <- "Country"
# sapply(vaByMarket, class)
vaByMarket <- vaByMarket %>%
mutate_at(vars(-one_of("Region", "Country")), as.numeric) %>%
filter(vaByMarket$Region != "NA") %>%
pivot_longer(cols = ! c("Region", "Country"), names_to = "Period", values_to = "Visitors") %>%
mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
mutate(Year = year(Period))3.2. Preparations for visualizations
to find the total number of visitors (regardless of date range):
totalVisitors <- sum(vaByMarket$Visitors, na.rm = TRUE)
totalVisitors[1] 325005202
to find the min and max year:
minYear <- min(year(vaByMarket$Period))
minYear[1] 1978
maxYear <- max(year(vaByMarket$Period))
maxYear[1] 2022
to find the country where most visitors are from:
visitorsByCountry <- vaByMarket %>%
group_by(Country) %>%
summarize(Visitors = sum(Visitors, na.rm = TRUE))
visitorsByCountry <- visitorsByCountry[order(visitorsByCountry$Visitors, decreasing = TRUE), ]
visitorsByCountry# A tibble: 52 × 2
Country Visitors
<chr> <dbl>
1 Indonesia 50918113
2 China 34648115
3 Japan 29394650
4 Australia 24989059
5 Malaysia 21009122
6 India 20987273
7 United Kingdom 15116531
8 USA 14620454
9 South Korea 11942521
10 Hong Kong SAR 11765677
# … with 42 more rows
mostFrom <- head(visitorsByCountry$Country, 1)
mostFrom[1] "Indonesia"
to find the number of countries that visited us:
temp <- vaByMarket[vaByMarket$Year >= 1984 & vaByMarket$Year <= 1990, ]
numCountries <- nrow(temp %>%
filter(!is.na(Visitors)) %>%
count(Country))
numCountries[1] 33
to list out regions
listRegions <- unique(vaByMarket$Region)
listRegions [1] "Southeast Asia" "Greater China" "North Asia" "South Asia"
[5] "West Asia" "Americas" "Europe" "Oceania"
[9] "Africa" "Others"
to list out countries
listCountry <- unique(vaByMarket$Country)
listCountry [1] "Brunei Darussalam" "Indonesia"
[3] "Malaysia" "Myanmar"
[5] "Philippines" "Thailand"
[7] "Vietnam" "Other Markets In Southeast Asia"
[9] "China" "Hong Kong SAR"
[11] "Taiwan" "Other Markets In Greater China"
[13] "Japan" "South Korea"
[15] "Other Markets In North Asia" "Bangladesh"
[17] "India" "Pakistan"
[19] "Sri Lanka" "Other Markets In South Asia"
[21] "Iran" "Israel"
[23] "Kuwait" "Saudi Arabia"
[25] "United Arab Emirates" "Other Markets In West Asia"
[27] "Canada" "USA"
[29] "Other Markets In Americas" "Belgium & Luxembourg"
[31] "Denmark" "Finland"
[33] "France" "Germany"
[35] "Italy" "Netherlands"
[37] "Norway" "Rep Of Ireland"
[39] "Russian Federation" "Spain"
[41] "Sweden" "Switzerland"
[43] "United Kingdom" "Other Markets In Europe"
[45] "Australia" "New Zealand"
[47] "Other Markets In Oceania" "Egypt"
[49] "Mauritius" "South Africa (Rep Of)"
[51] "Other Markets In Africa" "Others"
3.3. Visualizations
plotting visitors across time chart:
install the timetk package (recommended by Prof Kam)
pacman::p_load(timetk, lubridate, ggplot2, plotly, ggHoriPlot, sunburstR, d3r)Country/Region Sunburst Plot
vaByMarket# A tibble: 28,080 × 5
Region Country Period Visitors Year
<chr> <chr> <date> <dbl> <dbl>
1 Southeast Asia Brunei Darussalam 2022-12-01 7324 2022
2 Southeast Asia Brunei Darussalam 2022-11-01 4273 2022
3 Southeast Asia Brunei Darussalam 2022-10-01 3733 2022
4 Southeast Asia Brunei Darussalam 2022-09-01 3809 2022
5 Southeast Asia Brunei Darussalam 2022-08-01 3426 2022
6 Southeast Asia Brunei Darussalam 2022-07-01 3239 2022
7 Southeast Asia Brunei Darussalam 2022-06-01 2202 2022
8 Southeast Asia Brunei Darussalam 2022-05-01 2271 2022
9 Southeast Asia Brunei Darussalam 2022-04-01 807 2022
10 Southeast Asia Brunei Darussalam 2022-03-01 280 2022
# … with 28,070 more rows
country_hier <- vaByMarket %>%
group_by(Region, Country) %>%
summarize(Visitors = sum(Visitors, na.rm = TRUE))
country_sunburst <- d3_nest(country_hier, value_cols = "Visitors")
sunburst(data = country_sunburst,
valueField = "Visitors",
height = 300,
width = "100%",
legend = FALSE) Time Series Plot for Overall Trend (timetk)
timeSeriesOverall <- vaByMarket %>%
group_by(Period) %>%
summarise(Visitors = sum(Visitors, na.rm = TRUE))
timeSeriesOverall %>%
plot_time_series(Period, Visitors, .interactive = TRUE)Boxplot Time Series
timeSeriesOverall %>%
plot_time_series(Period, Visitors, .interactive = TRUE)Time Series by Country
minYear <- 2018
maxYear <- 2022
vaByCountry <- vaByMarket %>%
filter(Country %in% "Russian Federation")
vaByCountry <- vaByCountry[vaByCountry$Year >= minYear & vaByCountry$Year <= maxYear, ]
vaByCountry[is.na(vaByCountry)] = 0
timeSeriesCountry <- vaByCountry %>%
group_by(Period) %>%
summarise(Visitors = sum(Visitors, na.rm = TRUE))
timeSeriesCountry %>%
plot_time_series(Period, Visitors, .interactive = TRUE)Cycle Plot by Country
countryAvg <- vaByCountry %>%
mutate(Month = as.character(Period, format = '%b')) %>%
group_by(Month) %>%
summarise(avgValue = mean(Visitors)) %>%
mutate(avgValue = round(avgValue/1000, digits = 0))
cyclePlot <- vaByCountry %>%
group_by(Period) %>%
summarise(Visitors = sum(Visitors, na.rm = TRUE)) %>%
mutate(Month = as.character(Period, format = '%b')) %>%
mutate(Visitors = round(Visitors/1000, digits = 0))
figCyclePlot <- ggplot() +
geom_line(data = cyclePlot, aes(x = Period, y = Visitors, group = Month)) +
geom_hline(data = countryAvg, aes(yintercept = avgValue), colour = "red", size = 0.3) +
labs(x = "Date", y = "No. of Visitors", title = paste("Visitor Arrivals by Country, ", minYear, " to ", maxYear)) +
facet_grid(~ factor(Month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))) +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
ggplotly(figCyclePlot)Month/Year Heatmap by Country
vaByMarketCal <- vaByMarket %>%
mutate(Month = month(Period)) %>%
filter(Country %in% "Russian Federation")
vaByMarketCal <- vaByMarketCal[vaByMarketCal$Year >= minYear & vaByMarketCal$Year <= maxYear, ]
vaByMarketCal[is.na(vaByMarketCal)] = 0
scaleFUN <- function(x) sprintf("%.0f", x)
figHeatmapPlot <- ggplot(data = vaByMarketCal) +
geom_tile(aes(x = Month, y = Year, fill = Visitors)) +
labs(title = paste("Visitor Arrivals by Country, ", minYear, " to ", maxYear)) +
theme_bw() +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq_along(month.name), labels = month.abb) +
scale_y_continuous(labels = scaleFUN)
ggplotly(figHeatmapPlot)Compare two regions using the Horizon Plot for Regional Trend?
vaByMarket %>%
group_by(Period, Region) %>%
summarise(Visitors = sum(Visitors)) %>%
filter(Region == "South Asia") %>%
ggplot() +
geom_horizon(aes(x = Period, y = Visitors), origin = "midpoint", horizonscale = 6) +
facet_grid(Region~.) +
scale_fill_hcl(palette = "RdBu") +
scale_x_date(expand = c(0,0), date_breaks = "3 year", date_labels = " %b %y") +
theme_minimal() +
theme(panel.spacing.y = unit(0, "lines"),
strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
legend.position = 'none',
axis.text.y = element_blank(),
axis.text.x = element_text(size = 7),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank(),
panel.border = element_blank())
Time Series Plot for Regional Trend (ggplotly)
timeseriesRegion <- vaByMarket %>%
group_by(Region, Period) %>%
summarise(Visitors = sum(Visitors, na.rm = TRUE)) %>%
mutate(Visitors = round(Visitors/1000, digits = 0))
figTimeseriesRegion <- ggplot(data = timeseriesRegion, aes(x = Period, y = Visitors)) +
geom_line(aes(colour = Region)) +
labs(x = "Date", y = "No. of Visitors (K)", title = "Overall Trend of Visitor Arrivals by Region, 1978 to 2022") +
theme_bw()
ggplotly(figTimeseriesRegion)Time Series Plot for Country Trend (ggplotly)
timeseriesCountry <- vaByMarket %>%
group_by(Country, Period) %>%
summarise(Visitors = sum(Visitors, na.rm = TRUE)) %>%
mutate(Visitors = round(Visitors/1000, digits = 0))
figTimeseriesCountry <- ggplot(data = timeseriesCountry, aes(x = Period, y = Visitors)) +
geom_line(aes(colour = Country)) +
labs(x = "Date", y = "No. of Visitors (K)", title = "Overall Trend of Visitor Arrivals, 1978 to 2022") +
theme_bw()
ggplotly(figTimeseriesCountry)3.4. Forecasting
reference: https://www.r-bloggers.com/2020/06/introducing-modeltime-tidy-time-series-forecasting-using-tidymodels/
loading libraries
pacman::p_load(modeltime, kernlab, reactable, tidymodels)only keep the date and value columns for forecasting
vaByMarketTable <- vaByMarket %>%
group_by(Period) %>%
summarise(visitors = round(sum(Visitors, na.rm = TRUE))/1000) %>% # converted to thousands
set_names(c("date", "value"))
vaByMarketTable# A tibble: 540 × 2
date value
<date> <dbl>
1 1978-01-01 99.0
2 1978-02-01 87.0
3 1978-03-01 95.5
4 1978-04-01 84.2
5 1978-05-01 89.7
6 1978-06-01 78.5
7 1978-07-01 96.4
8 1978-08-01 113.
9 1978-09-01 87.8
10 1978-10-01 99.6
# … with 530 more rows
let’s create a train/test set”
splits <- vaByMarketTable %>%
time_series_split(
assess = "3 years",
cumulative = TRUE
)
splits<Analysis/Assess/Total>
<504/36/540>
let’s plot the train/test split to visualize the split:
splits %>%
tk_time_series_cv_plan() %>%
plot_time_series_cv_plan(date, value, .interactive = FALSE)
we will modelling using modeltime and parsnip libraries
Basic Auto ARIMA fitting
modelFitArima <- arima_reg(
non_seasonal_ar = 2,
non_seasonal_differences = 1,
non_seasonal_ma = 2
) %>%
set_engine("auto_arima") %>%
fit(value ~ date, training(splits))
modelFitArimaparsnip model object
Series: outcome
ARIMA(2,1,2)(0,1,2)[12]
Coefficients:
ar1 ar2 ma1 ma2 sma1 sma2
-0.0794 0.7370 -0.0192 -0.9356 -0.5090 -0.1233
s.e. 0.0442 0.0444 0.0240 0.0251 0.0453 0.0401
sigma^2 = 1498: log likelihood = -2492.86
AIC=4999.71 AICc=4999.94 BIC=5029.09
Prophet
modelFitProphet <- prophet_reg() %>%
set_engine("prophet", yearly.seasonality = TRUE) %>%
fit(value ~ date, training(splits))
modelFitProphetparsnip model object
PROPHET Model
- growth: 'linear'
- n.changepoints: 25
- changepoint.range: 0.8
- yearly.seasonality: 'auto'
- weekly.seasonality: 'auto'
- daily.seasonality: 'auto'
- seasonality.mode: 'additive'
- changepoint.prior.scale: 0.05
- seasonality.prior.scale: 10
- holidays.prior.scale: 10
- logistic_cap: NULL
- logistic_floor: NULL
- extra_regressors: 0
Machine Learning Models - create a pre-processing recipe - create model specs - use workflow to combine model specs, pre-processing and fit model
Pre-processing Recipe:
recipeSpecs <- recipe(value ~ date, training(splits)) %>%
step_timeseries_signature(date) %>%
step_dummy(all_nominal())
recipeSpecs %>%
prep() %>%
juice()# A tibble: 504 × 44
date value date_index…¹ date_…² date_…³ date_…⁴ date_…⁵ date_…⁶ date_…⁷
<date> <dbl> <dbl> <int> <int> <int> <int> <int> <int>
1 1978-01-01 99.0 252460800 1978 1977 1 1 1 0
2 1978-02-01 87.0 255139200 1978 1978 1 1 2 1
3 1978-03-01 95.5 257558400 1978 1978 1 1 3 2
4 1978-04-01 84.2 260236800 1978 1978 1 2 4 3
5 1978-05-01 89.7 262828800 1978 1978 1 2 5 4
6 1978-06-01 78.5 265507200 1978 1978 1 2 6 5
7 1978-07-01 96.4 268099200 1978 1978 2 3 7 6
8 1978-08-01 113. 270777600 1978 1978 2 3 8 7
9 1978-09-01 87.8 273456000 1978 1978 2 3 9 8
10 1978-10-01 99.6 276048000 1978 1978 2 4 10 9
# … with 494 more rows, 35 more variables: date_day <int>, date_hour <int>,
# date_minute <int>, date_second <int>, date_hour12 <int>, date_am.pm <int>,
# date_wday <int>, date_wday.xts <int>, date_mday <int>, date_qday <int>,
# date_yday <int>, date_mweek <int>, date_week <int>, date_week.iso <int>,
# date_week2 <int>, date_week3 <int>, date_week4 <int>, date_mday7 <int>,
# date_month.lbl_01 <dbl>, date_month.lbl_02 <dbl>, date_month.lbl_03 <dbl>,
# date_month.lbl_04 <dbl>, date_month.lbl_05 <dbl>, …
once the recipe is ready, we can set up the machine learning pipelines.
Elastic Net
- we will first build the model
- next, we will make that model into a fitted workflow
modelSpec_glmnet <- linear_reg(penalty = 0.01, mixture = 0.05) %>%
set_engine("glmnet")
workflowFit_glmnet <- workflow() %>%
add_model(modelSpec_glmnet) %>%
add_recipe(recipeSpecs %>%
step_rm(date)) %>%
fit(training(splits))Random Forest - similar process as the Elastic Net
modelSpec_rf <- rand_forest(trees = 500, min_n = 50, mode = "regression") %>%
set_engine("randomForest")
workflowFit_rf <- workflow() %>%
add_model(modelSpec_rf) %>%
add_recipe(recipeSpecs %>%
step_rm(date)) %>%
fit(training(splits))Adding new models - XGBoost - SVM
XGBoost:
recipeSpecsParsnip <- recipeSpecs %>%
update_role(date, new_role = "ID")
workflowFit_xgboost <- workflow() %>%
add_model(
boost_tree(
trees = 500,
min_n = 1,
tree_depth = 15,
mode = "regression"
) %>%
set_engine("xgboost")
) %>%
add_recipe(recipeSpecsParsnip) %>%
fit(training(splits))SVM:
workflowFit_svm <- workflow() %>%
add_model(
svm_rbf(
cost = 1,
margin = 0.1
) %>%
set_engine("kernlab") %>%
set_mode("regression")
) %>%
add_recipe(recipeSpecsParsnip) %>%
fit(training(splits))New Hybrid Models
- these combine automated algorithms with machine learning
Prophet Boost
- algorithm works by modelling the univariate series using Prophet
- uses regresses supplied via the preprocessing rrrcipe
- regresses the prophet residuals with the XGBoost model
modelSpec_prophetBoost <- prophet_boost() %>%
set_engine("prophet_xgboost", yearly.seasonality = TRUE)
workflowFit_prophetBoost <- workflow() %>%
add_model(modelSpec_prophetBoost) %>%
add_recipe(recipeSpecs) %>%
fit(training(splits))ARIMA Boosted:
workflowFit_arimaBoost <- workflow() %>%
add_model(
arima_boost(
non_seasonal_ar = 2,
non_seasonal_differences = 1,
non_seasonal_ma = 2,
trees = 500,
min_n = 1,
tree_depth = 15
) %>%
set_engine("auto_arima_xgboost")
) %>%
add_recipe(recipeSpecs) %>%
fit(training(splits))Modeltime Table
- organizes the models with IDs and creates generic descriptions too help us keep track of our models
modelTable <- modeltime_table(
modelFitArima,
modelFitProphet,
workflowFit_glmnet,
workflowFit_rf,
workflowFit_svm,
workflowFit_xgboost,
workflowFit_arimaBoost,
workflowFit_prophetBoost
) %>%
update_model_description(1, "ARIMA") %>%
update_model_description(2, "Prophet") %>%
update_model_description(3, "glmNet") %>%
update_model_description(4, "Random Forest") %>%
update_model_description(5, "SVM") %>%
update_model_description(6, "XGBoost") %>%
update_model_description(7, "ARIMA Boost") %>%
update_model_description(8, "Prophet Boost")
modelTable# Modeltime Table
# A tibble: 8 × 3
.model_id .model .model_desc
<int> <list> <chr>
1 1 <fit[+]> ARIMA
2 2 <fit[+]> Prophet
3 3 <workflow> glmNet
4 4 <workflow> Random Forest
5 5 <workflow> SVM
6 6 <workflow> XGBoost
7 7 <workflow> ARIMA Boost
8 8 <workflow> Prophet Boost
Calibration
- model calibration is used to quantify error and estimate confidence intervals
- two columns will be generated: .type and .calibration_data
- .calibration_data includes the actual values, fitted values and the residuals for the testing set
calibrationTable <- modelTable %>%
modeltime_calibrate(testing(splits))
calibrationTable# Modeltime Table
# A tibble: 8 × 5
.model_id .model .model_desc .type .calibration_data
<int> <list> <chr> <chr> <list>
1 1 <fit[+]> ARIMA Test <tibble [36 × 4]>
2 2 <fit[+]> Prophet Test <tibble [36 × 4]>
3 3 <workflow> glmNet Test <tibble [36 × 4]>
4 4 <workflow> Random Forest Test <tibble [36 × 4]>
5 5 <workflow> SVM Test <tibble [36 × 4]>
6 6 <workflow> XGBoost Test <tibble [36 × 4]>
7 7 <workflow> ARIMA Boost Test <tibble [36 × 4]>
8 8 <workflow> Prophet Boost Test <tibble [36 × 4]>
Forecast (Testing Set)
- with the calibrated data, we can visualize testing predictions (forecast)
forecastTable <- calibrationTable %>%
modeltime_forecast(actual_data = vaByMarketTable)
forecastTable# A tibble: 828 × 7
.model_id .model_desc .key .index .value .conf_lo .conf_hi
<int> <chr> <fct> <date> <dbl> <dbl> <dbl>
1 NA ACTUAL actual 1978-01-01 99.0 NA NA
2 NA ACTUAL actual 1978-02-01 87.0 NA NA
3 NA ACTUAL actual 1978-03-01 95.5 NA NA
4 NA ACTUAL actual 1978-04-01 84.2 NA NA
5 NA ACTUAL actual 1978-05-01 89.7 NA NA
6 NA ACTUAL actual 1978-06-01 78.5 NA NA
7 NA ACTUAL actual 1978-07-01 96.4 NA NA
8 NA ACTUAL actual 1978-08-01 113. NA NA
9 NA ACTUAL actual 1978-09-01 87.8 NA NA
10 NA ACTUAL actual 1978-10-01 99.6 NA NA
# … with 818 more rows
Plotting the forecasts:
forecastPlots <- forecastTable %>%
filter(.model_desc != "ACTUAL") %>%
plot_modeltime_forecast(
.interactive = FALSE,
.facet_vars = .model_desc,
.facet_ncol = 4,
.facet_scales = "fixed",
.legend_show = FALSE
) +
geom_line(
data = forecastTable %>%
filter(.model_desc == "ACTUAL") %>%
select(.index, .value),
size = 0.3
) +
labs(
title = "", y = "No. of Visitors", x = "Year"
) +
theme_bw() +
theme(legend.position = "none")
ggplotly(forecastPlots)Another way to plot accuracy:
accuracyTable <- calibrationTable %>%
modeltime_accuracy() %>%
select(.model_desc, mae, mape, mase, smape, rmse, rsq) %>%
reactable(
columns = list(
.model_desc = colDef(name = "Model"),
mae = colDef(name = "Mean Absolute Error", format = colFormat(digits = 2)),
mape = colDef(name = "Mean Absolute Percentage Error", format = colFormat(digits = 2)),
mase = colDef(name = "Mean Absolute Scaled Error", format = colFormat(digits = 2)),
smape = colDef(name = "Symmetric Mean Absolute Percentage Error", format = colFormat(digits = 2)),
rmse = colDef(name = "Root Mean Squared Error", format = colFormat(digits = 2)),
rsq = colDef(name = "R-Squared", format = colFormat(digits = 2))
),
highlight = TRUE,
striped = TRUE,
searchable = TRUE,
showPageSizeOptions = FALSE
)
accuracyTableNow that we have done forecasting, let’s do some refitting:
refitTable <- calibrationTable %>%
modeltime_refit(vaByMarketTable) %>%
modeltime_forecast(h = "3 years",
actual_data = vaByMarketTable) %>%
mutate(.model_desc = str_replace_all(.model_desc, "UPDATE: ARIMA\\(1,1,0\\)\\(2,0,0\\)\\[12\\]", "ARIMA")) %>%
mutate(.model_desc = str_replace_all(.model_desc, "W/ XGBOOST ERRORS", "Boost"))
refitTable# A tibble: 828 × 7
.model_id .model_desc .key .index .value .conf_lo .conf_hi
<int> <chr> <fct> <date> <dbl> <dbl> <dbl>
1 NA ACTUAL actual 1978-01-01 99.0 NA NA
2 NA ACTUAL actual 1978-02-01 87.0 NA NA
3 NA ACTUAL actual 1978-03-01 95.5 NA NA
4 NA ACTUAL actual 1978-04-01 84.2 NA NA
5 NA ACTUAL actual 1978-05-01 89.7 NA NA
6 NA ACTUAL actual 1978-06-01 78.5 NA NA
7 NA ACTUAL actual 1978-07-01 96.4 NA NA
8 NA ACTUAL actual 1978-08-01 113. NA NA
9 NA ACTUAL actual 1978-09-01 87.8 NA NA
10 NA ACTUAL actual 1978-10-01 99.6 NA NA
# … with 818 more rows
Now we will use the refitted data and do forward forecasting:
refitPlots <- refitTable %>%
filter(.model_desc != "ACTUAL") %>%
plot_modeltime_forecast(
.interactive = FALSE,
.facet_vars = .model_desc,
.facet_ncol = 4,
.facet_scales = "fixed",
.legend_show = FALSE
) +
geom_line(
data = forecastTable %>%
filter(.model_desc == "ACTUAL") %>%
select(.index, .value),
size = 0.3
) +
labs(
title = ""
) +
theme_bw() +
theme(legend.position = "none")
ggplotly(refitPlots)Now we have our forecasts and refit + forecasts. Let’s plot the residuals now:
residualsTable <- calibrationTable %>%
modeltime_residuals()
residualsPlots <- residualsTable %>%
plot_modeltime_residuals(
.interactive = FALSE,
.type = "timeplot",
.facet_vars = .model_desc,
.facet_ncol = 4,
.facet_scales = "fixed"
) +
labs(
title = "",
y = "Residuals"
) +
theme_bw() +
theme(legend.position = "none")
ggplotly(residualsPlots)xxx
4. Visitors’ Demographics
4.1. Cleansing the dataset
This file
- contains the international visitor arrivals by gender and age groups (monthly for the last 6 months of dec)
- excludes arrivals of Malaysians by land
- all numbers are counts
The following was done to cleanup the vaByDemoRaw dataframe:
- remove the bottom few rows as they were unnecessary for our visualizations
- rename fields and rearrange the columns
- split the dataframe into two, one to contain info on gender and one to contain info on age
- next, we will pivot date (month-year) and the number of visitors to reduce the number of columns
vaByDemo <- slice(vaByDemoRaw, 2:11)
vaByDemoGender <- slice(vaByDemo, 1:2)
vaByDemoAge <- slice(vaByDemo, 3:11)Cleansing Gender dataframe:
colnames(vaByDemoGender)[1] <- "Gender"
vaByDemoGender <- vaByDemoGender %>%
pivot_longer(cols = !c("Gender"), names_to = "Period", values_to = "Visitors") %>%
mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
mutate(Year = year(Period))
vaByDemoGender# A tibble: 12 × 4
Gender Period Visitors Year
<chr> <date> <dbl> <dbl>
1 Males 2022-12-01 457823 2022
2 Males 2022-11-01 428447 2022
3 Males 2022-10-01 423112 2022
4 Males 2022-09-01 419664 2022
5 Males 2022-08-01 375963 2022
6 Males 2022-07-01 371596 2022
7 Females 2022-12-01 473507 2022
8 Females 2022-11-01 387885 2022
9 Females 2022-10-01 393712 2022
10 Females 2022-09-01 362542 2022
11 Females 2022-08-01 352779 2022
12 Females 2022-07-01 355136 2022
Cleansing Age dataframe:
colnames(vaByDemoAge)[1] <- "AgeGroup"
vaByDemoAge <- vaByDemoAge %>%
pivot_longer(cols = !c("AgeGroup"), names_to = "Period", values_to = "Visitors") %>%
mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
mutate(Year = year(Period))
vaByDemoAge# A tibble: 48 × 4
AgeGroup Period Visitors Year
<chr> <date> <dbl> <dbl>
1 Under 15 Years 2022-12-01 123734 2022
2 Under 15 Years 2022-11-01 48421 2022
3 Under 15 Years 2022-10-01 63829 2022
4 Under 15 Years 2022-09-01 50827 2022
5 Under 15 Years 2022-08-01 64064 2022
6 Under 15 Years 2022-07-01 78914 2022
7 15-19 Years 2022-12-01 43167 2022
8 15-19 Years 2022-11-01 16801 2022
9 15-19 Years 2022-10-01 21290 2022
10 15-19 Years 2022-09-01 19359 2022
# … with 38 more rows
4.2. Preparations for visualizations
To find the age range of most visitors:
visitorsAge <- vaByDemoAge %>%
group_by(AgeGroup) %>%
summarise(Visitors = sum(Visitors))
visitorsAge <- visitorsAge[order(visitorsAge$Visitors, decreasing = TRUE), ]
mostAge <- head(visitorsAge$AgeGroup, 1)
mostAge[1] "25-34 Years"
4.3. Visualizations
pacman::p_load(ggplot2, plotly, timetk, ggHoriPlot, ggridges, gganimate)vaByDemoGender <- vaByDemoGender %>%
mutate(Month = month(Period))
vaByDemoGender# A tibble: 12 × 5
Gender Period Visitors Year Month
<chr> <date> <dbl> <dbl> <dbl>
1 Males 2022-12-01 457823 2022 12
2 Males 2022-11-01 428447 2022 11
3 Males 2022-10-01 423112 2022 10
4 Males 2022-09-01 419664 2022 9
5 Males 2022-08-01 375963 2022 8
6 Males 2022-07-01 371596 2022 7
7 Females 2022-12-01 473507 2022 12
8 Females 2022-11-01 387885 2022 11
9 Females 2022-10-01 393712 2022 10
10 Females 2022-09-01 362542 2022 9
11 Females 2022-08-01 352779 2022 8
12 Females 2022-07-01 355136 2022 7
vaByDemoAge <- vaByDemoAge %>%
mutate(Month = month(Period))
vaByDemoAge# A tibble: 48 × 5
AgeGroup Period Visitors Year Month
<chr> <date> <dbl> <dbl> <dbl>
1 Under 15 Years 2022-12-01 123734 2022 12
2 Under 15 Years 2022-11-01 48421 2022 11
3 Under 15 Years 2022-10-01 63829 2022 10
4 Under 15 Years 2022-09-01 50827 2022 9
5 Under 15 Years 2022-08-01 64064 2022 8
6 Under 15 Years 2022-07-01 78914 2022 7
7 15-19 Years 2022-12-01 43167 2022 12
8 15-19 Years 2022-11-01 16801 2022 11
9 15-19 Years 2022-10-01 21290 2022 10
10 15-19 Years 2022-09-01 19359 2022 9
# … with 38 more rows
Time Series Plot for Gender
vaByDemoGender %>%
plot_time_series(Period, Visitors, .color_var = Gender, .smooth = FALSE, .interactive = TRUE)Another way to plot time series:
p <- vaByDemoGender %>%
ggplot(aes(x = Period, y = Visitors, color = Gender)) +
geom_line() +
labs(x = "Period", y = "No. Of Visitors", title = NULL) +
theme_bw() +
theme(axis.text = element_text(size = 7), axis.title = element_text(size = 7), legend.position = "none")
p %>%
ggplotly()Time Series Plot for Age Groups
vaByDemoAge %>%
plot_time_series(Period, Visitors, .color_var = AgeGroup, .smooth = FALSE, .interactive = TRUE)Horizon Plot by Gender
vaByDemoGender %>%
group_by(Period, Gender) %>%
summarise(Visitors = sum(Visitors)) %>%
ggplot() +
geom_horizon(aes(x = Period, y = Visitors), origin = "midpoint", horizonscale = 6) +
facet_grid(Gender~.) +
scale_fill_hcl(palette = "RdBu") +
scale_x_date(expand = c(0,0), date_breaks = "1 month", date_labels = " %b %y") +
theme_minimal() +
theme(panel.spacing.y = unit(0, "lines"),
strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
legend.position = 'none',
axis.text.y = element_blank(),
axis.text.x = element_text(size = 7),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank(),
panel.border = element_blank())
Horizon Plot by Age Group
vaByDemoAge %>%
group_by(Period, AgeGroup) %>%
summarise(Visitors = sum(Visitors)) %>%
ggplot() +
geom_horizon(aes(x = Period, y = Visitors), origin = "midpoint", horizonscale = 6) +
facet_grid(AgeGroup~.) +
scale_fill_hcl(palette = "RdBu") +
scale_x_date(expand = c(0,0), date_breaks = "1 month", date_labels = " %b %y") +
theme_minimal() +
theme(panel.spacing.y = unit(0, "lines"),
strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
legend.position = 'none',
axis.text.y = element_blank(),
axis.text.x = element_text(size = 7),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank(),
panel.border = element_blank())
Heatmap by Gender
figHeatmapPlot <- ggplot(data = vaByDemoGender) +
geom_tile(aes(x = Month, y = Gender, fill = Visitors)) +
theme_bw() +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq_along(month.name), labels = month.abb)
ggplotly(figHeatmapPlot)Heatmap by Age Group
figHeatmapPlot <- ggplot(data = vaByDemoAge) +
geom_tile(aes(x = Month, y = AgeGroup, fill = Visitors)) +
theme_bw() +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq_along(month.name), labels = month.abb)
ggplotly(figHeatmapPlot)xxx
5. Visitors’ Length of Stay
5.1 Cleansing the dataset
This file
- contains the international visitor arrivals by length of stay (monthly for the last 6 months of dec)
- excludes arrivals of Malaysians by land
- all numbers are counts
The following was done to cleanup the vaByStayRaw dataframe:
- remove the bottom few rows as they were unnecessary for our visualizations
- rename fields and rearrange the columns
- next, we will pivot date (month-year) and the number of visitors to reduce the number of columns
- we will also replace all mention of “(Number)” as we do not need this text explicitly stated in our column
vaByStay <- slice(vaByStayRaw, 2:15)
vaByStay# A tibble: 14 × 7
`Data Series` `2022 Dec` `2022 Nov` 2022 …¹ 2022 …² 2022 …³ 2022 …⁴
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Under 1 Day (Number) 166698 146629 136573 134931 125981 108581
2 1 Day (Number) 114412 108266 102074 97676 96875 88056
3 2 Days (Number) 118019 124149 117744 104369 101746 91996
4 3 Days (Number) 129601 132535 140122 114944 112391 109801
5 4 Days (Number) 92109 86753 98830 76710 74885 75171
6 5 Days (Number) 54792 51619 58714 43580 41261 44932
7 6 Days (Number) 34055 30955 37834 27037 26212 29829
8 7 Days (Number) 23613 21826 28066 19442 19651 23829
9 8-10 Days (Number) 30053 25531 33328 24410 26013 30521
10 11-14 Days (Number) 18078 15127 18215 14734 17264 19508
11 15 Days & Over (Number) 34224 32414 34479 33607 43560 44098
12 15-29 Days (Number) 22915 21036 23278 21194 27657 29614
13 30-59 Days (Number) 7458 7646 7415 8175 11112 9747
14 60 Days & Over (Number) 3851 3732 3786 4238 4791 4737
# … with abbreviated variable names ¹`2022 Oct`, ²`2022 Sep`, ³`2022 Aug`,
# ⁴`2022 Jul`
Cleansing the dataframe:
colnames(vaByStay)[1] <- "Duration"
vaByStay <- vaByStay %>%
pivot_longer(cols = !c("Duration"), names_to = "Period", values_to = "Visitors") %>%
mutate(Period = as.Date(paste(Period, "01"), "%Y %B %d")) %>%
mutate(Year = year(Period)) %>%
mutate(Month = month(Period)) %>%
mutate(Duration = str_replace_all(Duration, " \\(Number\\)", ""))
vaByStay# A tibble: 84 × 5
Duration Period Visitors Year Month
<chr> <date> <dbl> <dbl> <dbl>
1 Under 1 Day 2022-12-01 166698 2022 12
2 Under 1 Day 2022-11-01 146629 2022 11
3 Under 1 Day 2022-10-01 136573 2022 10
4 Under 1 Day 2022-09-01 134931 2022 9
5 Under 1 Day 2022-08-01 125981 2022 8
6 Under 1 Day 2022-07-01 108581 2022 7
7 1 Day 2022-12-01 114412 2022 12
8 1 Day 2022-11-01 108266 2022 11
9 1 Day 2022-10-01 102074 2022 10
10 1 Day 2022-09-01 97676 2022 9
# … with 74 more rows
xxx
5.2 Preparations for visualizations
To find the avg days most visitors stayed:
visitorsStayed <- vaByStay %>%
group_by(Duration) %>%
summarise(Visitors = sum(Visitors))
visitorsStayed <- visitorsStayed[order(visitorsStayed$Visitors, decreasing = TRUE), ]
mostStayed <- head(visitorsStayed$Duration, 1)
mostStayed[1] "Under 1 Day"
xxx
5.3 Visualizations
Time Series Plot for Gender
vaByStay %>%
plot_time_series(Period, Visitors, .smooth = FALSE, .color_var = Duration, .interactive = TRUE)Time Series Boxplot for Length of Stay
vaByStay %>%
plot_time_series_boxplot(Period, Visitors, .smooth = FALSE, .period = "1 month", .interactive = TRUE, .title = NULL)Time Series Analysis of Length of Stay
p <- vaByStay %>%
ggplot(aes(x = Period, y = Visitors, color = Duration)) +
geom_line() +
labs(x = "Period", y = "No. Of Visitors", title = NULL) +
theme_bw() +
theme(axis.text = element_text(size = 7), axis.title = element_text(size = 7))
p %>%
ggplotly()Ridgeline Plot by Length of Stay
vaByStay %>%
ggplot(aes(x = Visitors, y = Duration, fill = after_stat(x))) +
geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
scale_fill_viridis_c(name = "Visitors", option = "C") +
theme_bw()
pacman::p_load(viridis, hrbrthemes)Area Chart by Length of Stay
areaChart <- vaByStay %>%
ggplot(aes(x = Period, y = Visitors, fill = Duration)) +
geom_area() +
scale_fill_viridis(discrete = TRUE) +
theme(legend.position="none") +
theme_bw() +
theme(legend.position="none")
ggplotly(areaChart)Horizon Plot by Length of Stay
vaByStay %>%
group_by(Period, Duration) %>%
summarise(Visitors = sum(Visitors)) %>%
ggplot() +
geom_horizon(aes(x = Period, y = Visitors), origin = "midpoint", horizonscale = 6) +
facet_grid(Duration~.) +
scale_fill_hcl(palette = "RdBu") +
scale_x_date(expand = c(0,0), date_breaks = "1 month", date_labels = " %b %y") +
theme_minimal() +
theme(panel.spacing.y = unit(0, "lines"),
strip.text.y = element_text(size = 10, angle = 0, hjust = 0),
legend.position = 'none',
axis.text.y = element_blank(),
axis.text.x = element_text(size = 7),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
axis.ticks.y = element_blank(),
panel.border = element_blank())
Heatmap by Length of Stay
figHeatmapLOS <- ggplot(vaByStay) +
geom_tile(aes(x = Month, y = Duration, fill = Visitors)) +
theme_bw() +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq_along(month.name), labels = month.abb)
ggplotly(figHeatmapLOS)Using ggTimeSeries
pacman::p_load(ggTimeSeries)Streamgraph for Length of Stay (doesn’t look that nice)
p1 <- vaByStay %>%
ggplot(aes(x = Period, y = Visitors, group = Duration, fill = Duration)) +
stat_steamgraph()
ggplotly(p1)using ggbump
pacman::p_load(ggbump)Bump Chart for Length of Stay
px <- vaByStay %>%
ggplot(aes(x = Period, y = Visitors, color = Duration)) +
geom_bump(size = 1) +
geom_point(size = 1.2)
ggplotly(px)xxx